Prediction using the normal score for the Wall Street columns data, using the same data clusters.

Here we will test how prediction with receptive fields that are mixed in time compares with prediction using non-mixed (time-independent) receptive fields, where the clusters are the same for each time step.

First as usual we load everything that we need.


In [1]:
import numpy as np
from sklearn import svm, cross_validation
import h5py

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import sys
sys.path.append("../")


/home/heberto/miniconda/envs/nexa/lib/python3.5/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

Without Spaces

Load the code vectors and the features


In [2]:
# Number of code vectors to use for classification.
Ndata = 10000

# Results database holding the code vectors.
file_location = '../results_database/text_wall_street_columns_indp.hdf5'

# Load the letter sequence so each code vector can be labelled with its letter.
text_directory = '../data/wall_street_letters_spaces.npy'
letters_sequence = np.load(text_directory)
Nletters = len(letters_sequence)
symbols = set(letters_sequence)

# Each letter spans 10 consecutive data points, so data point `i`
# is labelled with the letter at position i // 10.
targets = np.array([letters_sequence[i // 10] for i in range(Ndata)])

Do the loop and calculate the predictions


In [3]:
# Sweep over the number of time clusters: 10, 13, ..., 34.
Ntime_clusters_set = np.arange(10, 37, 3)

# Accuracies (in %) collected for the mixed and independent conditions.
scores_mixed, scores_indp = [], []

# Nexa parameters held fixed across the sweep.
Nspatial_clusters = 3
Nembedding = 3

In [4]:
def evaluate_run(h5_file, run_name, Ntime_clusters):
    """Return the linear-SVM test accuracy (in %) for one stored run.

    Parameters
    ----------
    h5_file : h5py.File
        Open results database.
    run_name : str
        Group prefix inside the file: '/test' for the time-mixed
        receptive fields, '/indep' for the independent ones.
    Ntime_clusters : int
        Number of time clusters identifying the parameter group.
    """
    # Group path encodes the nexa parameters, e.g. '/test/3-10-3'.
    parameters_string = '/' + str(Nspatial_clusters)
    parameters_string += '-' + str(Ntime_clusters)
    parameters_string += '-' + str(Nembedding)

    nexa = h5_file[run_name + parameters_string]
    code_vectors_softmax = np.array(nexa['code-vectors-softmax'])

    # Classify the letters with a linear SVM on a 10% held-out test split.
    X = code_vectors_softmax[:Ndata]
    y = targets
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.10)

    clf_linear = svm.SVC(C=1.0, kernel='linear')
    clf_linear.fit(X_train, y_train)
    return clf_linear.score(X_test, y_test) * 100.0


# Open the database once and close it automatically (the original code
# reopened the file twice per iteration and never closed the handles),
# then score both conditions for every number of time clusters.
with h5py.File(file_location, 'r') as f:
    for Ntime_clusters in Ntime_clusters_set:
        print(Ntime_clusters)
        scores_mixed.append(evaluate_run(f, '/test', Ntime_clusters))
        scores_indp.append(evaluate_run(f, '/indep', Ntime_clusters))


10
13
16
19
22
25
28
31
34

In [5]:
# Plot accuracy vs number of data clusters for both feature conditions.
fig = plt.figure(figsize=(16, 12))
ax = fig.add_subplot(111)
ax.plot(Ntime_clusters_set, scores_indp, 'o-', label='independent', lw=2, markersize=10)
ax.plot(Ntime_clusters_set, scores_mixed, 'o-', label='mixed', lw=2, markersize=10)

# Scores are percentages, so give a little headroom above 100.
ax.set_ylim(0, 105)
ax.set_ylabel('Accuracy')
ax.set_xlabel('Number of Data Clusters')
# Fixed typo in the title: 'Sapces' -> 'Spaces'.
ax.set_title('Accuracy vs Number of Data Clusters for different features (Without Spaces)')
ax.legend()


Out[5]:
<matplotlib.legend.Legend at 0x7fabdb136f60>

In [6]:
# Sanity check: the first 20 labels should show each letter repeated 10 times.
targets[:20]


Out[6]:
array(['p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'i', 'i', 'i',
       'i', 'i', 'i', 'i', 'i', 'i', 'i'], 
      dtype='<U1')